#######################################################
# Replication File for:                               #
# Jankowski/Huber | "When Correlation Is Not Enough"  #
# Published in: Political Analysis                    #
# Contact 1: michael.jankowski@uol.de                 #
# Contact 2: robertalexander.huber@plus.ac.at         #
#######################################################

#######################################################
# Please note: All models were estimated using Python.
# To replicate the models, you first have to run all
# relevant Python files! The Python analyses export
# several objects, which are then loaded into R.
# R is only used for visualizations/tables and some simple
# computations based on the Python output.
# Because the Python output is included in the replication files,
# this script should run even if you do not run the Python files.
#######################################################

#######################################################
# (1) Installing required packages:
#######################################################

# R packages needed by this replication pipeline.
# The order is kept unchanged: install_load() walks through the vector
# front to back and attaches the packages in exactly this sequence.
pkgs <- c(
  "tidyverse", "data.table", "visreg", "xtable",
  "broom", "rio", "texreg", "reticulate",
  "xml2", "here", "dataverse", "rvest"
)

# Make sure every package in `packages` is installed and attached.
#
# For each package name, in the order given:
#   - if the package is already installed, it is attached with library();
#   - otherwise it is installed with install.packages() and then attached.
#
# Args:
#   packages: character vector of package names.
#
# Returns:
#   NULL, invisibly. The function is called for its side effects
#   (installing/attaching packages, printing progress to the console).
#
# Errors:
#   Stops with an explicit message if a package still cannot be found
#   after the installation attempt.

install_load <- function(packages){

  for (p in packages) {
    cat("Check package: '", p, "'...\n", sep = "")
    flush.console()

    # system.file() returns "" for a package that is not installed. This
    # avoids calling installed.packages() once per package, which the R
    # documentation discourages for checking a single named package.
    already_installed <- nzchar(system.file(package = p))

    if (already_installed) {

      cat("Package: '", p, "' is already installed...\n\n", sep = "")
      flush.console()

    } else {

      cat("Package: '", p, "' is NOT installed! Will install now...\n\n", sep = "")
      flush.console()
      install.packages(p)

      # install.packages() only warns on failure; fail here with a clear
      # message instead of a later "there is no package called" error.
      if (!nzchar(system.file(package = p))) {
        stop("Installation of package '", p, "' failed.", call. = FALSE)
      }

    }

    # Attach the package (same call for both branches)
    library(p, character.only = TRUE)
  }

  cat("\nAll packages installed!\n\n")
  invisible(NULL)
}

# Install (where necessary) and attach every package listed in `pkgs`

install_load(packages = pkgs)

# Install the Python package(s) needed for the analysis:

py_install(packages = "pandas")

#######################################################
# (2) Declare the project root for the here() package
#######################################################

here::i_am(path = "master_replication.R")

# Start from an empty workspace

rm(list = ls())

#######################################################
# (3) Untar all (Jankowski and Huber) tar-files
#######################################################

## Run this code to untar all folders!
## This needs to be run prior to all other R Scripts!

library(here)

# All *.tar archives located directly in the project root
all_tar_files <- list.files(here(),
                            pattern = "\\.tar$")

# Extract every archive into the project root.
# `exdir` is passed explicitly because untar() otherwise extracts into the
# current working directory, which is not necessarily the project root the
# archives were listed from.
for (tar_file in all_tar_files) {

  status <- untar(here(tar_file),
                  exdir = here())

  # untar() reports failure through a non-zero return code; stop early
  # instead of letting later scripts fail on missing files.
  if (!isTRUE(status == 0)) {
    stop("Could not extract archive '", tar_file, "'.", call. = FALSE)
  }

}

rm(list = ls())

#######################################################
# (4) Download DCM's replication data from dataverse
# ([size: approx. 20gb] this takes some minutes)
#######################################################

# Create the download folder; skipped if it already exists (e.g. on a re-run),
# so that no "already exists" warning is raised.
if (!dir.exists(here("dcm_dataverse"))) {
  dir.create(here("dcm_dataverse"))
}

# Mapping: local file name -> download URL
files <- list("00_generate_bag_of_words.ipynb" = "https://dataverse.harvard.edu/api/access/datafile/4756350",
              "01_train_all_models.py" = "https://dataverse.harvard.edu/api/access/datafile/4756365",
              "01_train_model.ipynb" = "https://dataverse.harvard.edu/api/access/datafile/4756351",
              "02_compute_scores.ipynb" = "https://dataverse.harvard.edu/api/access/datafile/4756352",
              "03_results.ipynb" = "https://dataverse.harvard.edu/api/access/datafile/4756353",
              "CHES_data.tar.gz" = "https://dataverse.harvard.edu/api/access/datafile/4756355",
              "GPD_data.tar.gz" = "https://dataverse.harvard.edu/api/access/datafile/4756357",
              "main_text_figures.tar.gz" = "https://dataverse.harvard.edu/api/access/datafile/4756358",
              "models.tar.gz" = "https://dataverse.harvard.edu/api/access/datafile/4756366",
              "POPPA_data.tar.gz" = "https://dataverse.harvard.edu/api/access/datafile/4756359",
              "README.txt" = "https://dataverse.harvard.edu/api/access/datafile/4756360",
              "SI_figures.tar.gz" = "https://dataverse.harvard.edu/api/access/datafile/4756362",
              "training_results.json" = "https://dataverse.harvard.edu/api/access/datafile/4756363",
              "train_configurations.py" = "https://dataverse.harvard.edu/api/access/datafile/4756364",
              "bow_and_labels.tar.gz" = "https://dataverse.harvard.edu/api/access/datafile/4756354",
              "datasets.tar.gz" = "https://dataverse.harvard.edu/api/access/datafile/4756356",
              "scores.tar.gz" = "https://dataverse.harvard.edu/api/access/datafile/4756361")

options(timeout = 2000) # (increase time for download, default of R = 60sec.)

for (i in seq_along(files)) {

  fname <- names(files)[i]
  flink <- files[[i]]

  cat("Download of file: ", fname, " (", i, " of ", length(files), " files)\n", sep = "")
  flush.console()

  # mode = "wb": write the file in binary mode
  status <- download.file(flink,
                          destfile = here("dcm_dataverse", fname),
                          mode = "wb")

  # download.file() returns 0 on success; abort on anything else so that an
  # incomplete download is not silently passed on to the extraction step.
  if (!isTRUE(status == 0)) {
    stop("Download of file '", fname, "' failed (status: ", status, ").",
         call. = FALSE)
  }

}

cat("\n\nAll files downloaded!\n\nNow unzipping tar files.\n\n")

#######################################################
# (5) Untar DCM Data (takes approx. 1min)
#######################################################

# Full paths of all *.tar.gz archives inside the download folder
all_tar_files <- list.files(
  path = here("dcm_dataverse"),
  pattern = "\\.tar\\.gz$",
  full.names = TRUE
)

# Unpack each archive into the download folder, announcing it first
lapply(
  X = all_tar_files,
  FUN = function(archive) {
    cat("Untar:", archive, "\n\n")
    flush.console()

    untar(tarfile = archive, exdir = here("dcm_dataverse"))
  }
)

cat("All files downloaded and extracted.\n\n")

#######################################################
# (6) Replication of all R Tables/Figures.
#######################################################

#######################################################
# Table 1 (Top5 Feature Importance by Country) and Table A4 (feature importance for Austria)
# also includes some proportions mentioned in main text of section 3.2
#######################################################

# The script is parsed as UTF-8 and evaluated; afterwards the workspace is
# emptied. The same two steps are repeated for every script below.
eval(parse(file = here("R_Scripts", "Table1.R"), encoding = "UTF-8"))

rm(list = ls())

#######################################################
# Table 2: created manually, so there is no code to run
#######################################################

#######################################################
# Figure 1: Austria plot
#######################################################

eval(parse(file = here("R_Scripts", "Figure1.R"), encoding = "UTF-8"))

rm(list = ls())

#######################################################
# Figure 2: density and boxplot of reshuffles
# (the German plot is used in the main text, the others
# in appendix Section A9)
#######################################################

eval(parse(file = here("R_Scripts", "Figure2.R"), encoding = "UTF-8"))

rm(list = ls())

#######################################################
# Figure 3: reshuffle correlation histograms
# (also includes some correlations mentioned in the main text of section 4.3)
#######################################################

eval(parse(file = here("R_Scripts", "Figure3.R"), encoding = "UTF-8"))

rm(list = ls())

#######################################################
# Appendix Section A1 (LOGIT: POPPA X PopuList)
#######################################################

# Each appendix script is parsed as UTF-8 and evaluated; the workspace is
# emptied again after every script.
eval(parse(file = here("R_Scripts", "AppendixA1.R"), encoding = "UTF-8"))

rm(list = ls())

#######################################################
# Appendix Section A2: VDEM X DCM, FE regression
#######################################################

eval(parse(file = here("R_Scripts", "AppendixA2.R"), encoding = "UTF-8"))

rm(list = ls())

#######################################################
# Appendix Section A3: feature importance based on SHAP values
#######################################################

eval(parse(file = here("R_Scripts", "AppendixA3.R"), encoding = "UTF-8"))

rm(list = ls())

#######################################################
# Appendix Section A4: correlation between host ideologies and
# populism in the six countries
#######################################################

eval(parse(file = here("R_Scripts", "AppendixA4.R"), encoding = "UTF-8"))

rm(list = ls())

#######################################################
# Appendix Section A5: coding errors in Austria (no script to run here)
# Table A3 (wrongly assigned manifestos and labels) was created manually
# Table A4 (feature importance) was created in the code for Table 1 (see Table1.R)
# Figure A7 (SHAP values) was created in "AppendixA3.R"
#######################################################

#######################################################
# Appendix Section A6: coding error in Germany
#######################################################

eval(parse(file = here("R_Scripts", "AppendixA6.R"), encoding = "UTF-8"))

rm(list = ls())

#######################################################
# Appendix Section A7: effect of removing party names on model
# performance in Germany
# Table A5 (change in F1 scores) was created based on Python output
# Table A8 was manually translated from Table A7
#######################################################

eval(parse(file = here("R_Scripts", "AppendixA7.R"), encoding = "UTF-8"))

rm(list = ls())

#######################################################
# Appendix Section A8: effect of removing party names on model
# performance in NL
# Table A9 (change in F1 scores) was created based on Python output
# Table A12 was manually translated from Table A11
#######################################################

eval(parse(file = here("R_Scripts", "AppendixA8.R"), encoding = "UTF-8"))

rm(list = ls())

#######################################################
# Appendix A9: reshuffling results for all countries
# These plots were already created in "Figure2.R" (see above)
#######################################################

#######################################################
# Appendix Section A10: feature importance based on MIP (Top50)
#######################################################

eval(parse(file = here("R_Scripts", "AppendixA10.R"), encoding = "UTF-8"))

rm(list = ls())

#######################################################
#######################################################
# End of R Script #####################################
#######################################################
#######################################################



